Data loading

library("readr")
library("igraph")
library("dplyr")
library("stringr")
library("scales")
library("textreuse")
source("R/helper.R")
source("R/section-matches.R")

Read the data.

# Loads the precomputed objects from the LSH pass — presumably all_matches,
# best_matches, and summary_matches used below; TODO confirm against the cache.
load("cache/corpus-lsh.rda")

Spectrograms

source("R/spectrogram.R")

Spectrogram of borrowings in CA1850. I expect it will show heavy reliance on NY1849 amidst original material.

# Best-match borrowings per CA1850 section; white_list presumably caps how
# many source codes get their own color — see R/spectrogram.R to confirm.
spectrogram("CA1850", best_matches, white_list = 7)
## Loading required package: ggplot2

Spectrogram of borrowings in MD1855. I expect to see reliance on English legislation (GB1852 and GB1854), amidst original material.

# Best-match borrowings per MD1855 section (same call shape as CA1850 above).
spectrogram("MD1855", best_matches, white_list = 7)

Spectrogram of borrowings in WA1855. An earlier version showed two long runs of borrowings from Oregon and Indiana. If the latest data still visualizes that kind of splicing, I think it’s a great illustration.

# Best-match borrowings per WA1855 section; larger white_list than above.
spectrogram("WA1855", best_matches, white_list = 8)

The point of this and the next exercise is not to show borrowings from all sources, but similarity between two sources. Our normal spectrogram of NV1861 will show heavy reliance on California. I want this to show that even though Nevada is two steps away from New York, its text is still very similar to New York throughout.

# Every section of NV1861, whether or not it has a New York match, so the
# spectrogram shows original material as gaps instead of dropping it.
all_NV1861 <- data_frame(borrower_code = "NV1861",
                         borrower_section = all_matches %>% 
                           filter(borrower_code == "NV1861") %>% 
                           `$`("borrower_section") %>%
                           unique() %>%
                           sort()
                        )

# For each NV1861 section, keep only its single best-scoring match to any
# New York code enacted by 1861, with a minimum score of 0.1.
matches_to_NY <- all_matches %>% 
  filter(borrower_code == "NV1861",
         str_detect(match_code, "NY"),
         score >= 0.1,
         match_year <= 1861) %>% 
  group_by(borrower_section) %>% 
  arrange(desc(score)) %>% 
  slice(1) %>% 
  ungroup() 

# Join keys are spelled out explicitly (they used to be auto-detected,
# printing "Joining by: ..."); a schema change now fails loudly instead of
# silently joining on different columns.
NV1861toNY <- all_NV1861 %>% 
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>% 
  arrange(borrower_section) 
spectrogram("NV1861", NV1861toNY, white_list = 5,
            title = "Sections in NV1861 with high similarity to NY codes")

Like the previous query for NV1861, Iowa is two steps away from New York. This time, however, the text is much less similar to New York. I only want one visualization for this, but I’m not sure if IA1851 or the revision IA1859 will be better, so can we do both?

IA1851

# Every section of IA1851, with or without a New York match, so original
# material appears as gaps in the spectrogram.
all_IA1851 <- data_frame(borrower_code = "IA1851",
                         borrower_section = all_matches %>% 
                           filter(borrower_code == "IA1851") %>% 
                           `$`("borrower_section") %>%
                           unique() %>%
                           sort()
                        )

# For each IA1851 section, keep only its single best-scoring match to any
# New York code enacted by 1851, with a minimum score of 0.1.
matches_to_NY <- all_matches %>% 
  filter(borrower_code == "IA1851",
         str_detect(match_code, "NY"),
         score >= 0.1,
         match_year <= 1851) %>% 
  group_by(borrower_section) %>% 
  arrange(desc(score)) %>% 
  slice(1) %>% 
  ungroup() 

# Explicit join keys (previously auto-detected, printing "Joining by: ...")
# so a schema change fails loudly instead of joining on the wrong columns.
IA1851toNY <- all_IA1851 %>% 
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>% 
  arrange(borrower_section) 
spectrogram("IA1851", IA1851toNY, white_list = 10,
            title = "Sections in IA1851 with matches to NY codes")

IA1859

# Every section of IA1859, with or without a New York match, so original
# material appears as gaps in the spectrogram.
all_IA1859 <- data_frame(borrower_code = "IA1859",
                         borrower_section = all_matches %>% 
                           filter(borrower_code == "IA1859") %>% 
                           `$`("borrower_section") %>%
                           unique() %>%
                           sort()
                        )

# For each IA1859 section, keep only its single best-scoring match to any
# New York code enacted by 1859, with a minimum score of 0.1.
matches_to_NY <- all_matches %>% 
  filter(borrower_code == "IA1859",
         str_detect(match_code, "NY"),
         score >= 0.1,
         match_year <= 1859) %>% 
  group_by(borrower_section) %>% 
  arrange(desc(score)) %>% 
  slice(1) %>% 
  ungroup() 

# Explicit join keys (previously auto-detected, printing "Joining by: ...")
# so a schema change fails loudly instead of joining on the wrong columns.
IA1859toNY <- all_IA1859 %>% 
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>% 
  arrange(borrower_section) 
spectrogram("IA1859", IA1859toNY, white_list = 10,
            title = "Sections in IA1859 with matches to NY codes")

Spectrogram of borrowings in NC1868. Should show heavy reliance on one or another New York Code, with scattered provisions coming from elsewhere.

# Best-match borrowings per NC1868 section.
spectrogram("NC1868", best_matches, white_list = 8)

Spectrogram of borrowings in CO1868. Should show heavy reliance on Illinois law.

# Best-match borrowings per CO1868 section.
spectrogram("CO1868", best_matches, white_list = 8)

Same query as numbers 6 and 7, but, like Louisiana, the point is to show that no one borrows the later New York Code, the one that departs from the Field tradition.

# The two post-Field New York codes, expected to show little borrowing.
spectrogram("NY1876", best_matches, white_list = 8)

spectrogram("NY1879", best_matches, white_list = 8)

Here I’m looking for the reverse of the previous spectrograms. Rather than illustrate sections BORROWED FROM elsewhere, can we illustrate sections in the original Field Code that GET BORROWED elsewhere, and instead of coloring by where they end up, color based on how many times a particular section appears throughout the corpus?

I expect the beginning of the code will be grey–no one has New York’s particular court system, so no one copies the jurisdictional texts, then high heat at the heart of the Code with the Field reforms, slackening off around particular proceedings, some of which will be New York specific, then heating up again towards the end with the code of evidence.

Network graphs

Create a network graph based on section percentages.

# Directed edge list weighted by the share of sections borrowed: each code
# keeps its (up to) two largest sources, provided at least 5% of its
# sections came from that source. top_n() retains ties, so a code can
# contribute more than two edges.
edges_pct <- summary_matches %>% 
  filter(!is.na(match_code),
         percent_borrowed >= 0.05) %>% 
  select(borrower_code, match_code, weight = percent_borrowed) %>% 
  group_by(borrower_code) %>% 
  top_n(2, weight)
edges_pct
## Source: local data frame [136 x 3]
## Groups: borrower_code [83]
## 
##    borrower_code match_code weight
##            (chr)      (chr)  (dbl)
## 1         AK1900     OR1862 0.5937
## 2         AR1868     KY1851 0.3634
## 3         AR1868     KY1854 0.3158
## 4         AR1874     AR1868 0.6752
## 5         AR1874     KY1851 0.0818
## 6         AZ1865     CA1851 0.5476
## 7         AZ1865     CA1858 0.2590
## 8         AZ1887     CA1872 0.4313
## 9         CA1850     NY1849 0.2972
## 10        CA1850     NY1850 0.1207
## ..           ...        ...    ...
# Build the graph once just to measure each code's hop distance to NY1850.
g <- graph_from_data_frame(edges_pct, directed = TRUE) 
# distances() warns that edge weights are ignored — expected; we want hops.
nodes <- distances(g, to = "NY1850", algorithm = "unweighted") %>%
  as.data.frame() %>% 
  add_rownames() %>% 
  rename(name = rowname, distance = NY1850) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))
# All four editions of the New York Field Code count as the origin (red),
# not just NY1850 itself.
field_editions <- c("NY1848", "NY1849", "NY1850", "NY1851")
nodes[nodes$name %in% field_editions, "color"] <- "red"
# Rebuild with the node attributes attached, then lay it out reproducibly.
g <- graph_from_data_frame(edges_pct, directed = TRUE, vertices = nodes) 
V(g)$year <- extract_date(V(g)$name)
set.seed(4221)
g <- add_layout_(g, with_graphopt(niter = 4000, spring.length = 25), normalize())

# Plot the borrowing network restricted to codes enacted in or before `year`.
# Vertex coordinates come from the full graph's precomputed layout so the
# successive frames stay spatially consistent across years.
#
# x    — an igraph graph with a V(x)$year attribute and an x$layout matrix
#        whose rows correspond to V(x) in order.
# year — upper bound (inclusive) on enactment year.
plot_before_year <- function(x, year) {
  # induced_subgraph() is the current igraph name; the file already uses the
  # underscore-style API (graph_from_data_frame, add_layout_).
  x_before <- induced_subgraph(x, which(V(x)$year <= year))
  # The subgraph inherits the full layout matrix; keep only the rows for the
  # surviving vertices. (Renamed from `filter`, which shadowed dplyr::filter.)
  keep <- V(x)$name %in% V(x_before)$name
  x_before$layout <- x_before$layout[keep, ]
  # Restore graphics parameters on exit rather than leaking them.
  old_par <- par(mar = c(0, 0, 1, 0))
  on.exit(par(old_par), add = TRUE)
  plot(x_before, edge.width = E(x_before)$weight * 8,
       edge.arrow.size = 0.0, vertex.size = 5)
  title(paste0("Codes of Civil Procedure before ", year))
} 

# One frame every five years, 1850 through 1900.
for (yr in seq(1850, 1900, by = 5)) {
  plot_before_year(g, yr)
}

Create a graph based on numbers (not percentages) of sections shared. Notice that we are keeping only code to code matches that share a certain number of sections (minimum_n), we are keeping only a certain number of matches for each code (top_matches), and we are omitting codes that aren’t part of the main network.

# Minimum number of shared sections for a code-to-code edge to count at all.
minimum_n <- 20
# Keep at most this many source codes per borrowing code (ties retained).
top_matches <- 2
# Codes omitted because they are not part of the main borrowing network.
# The commented-out entries appear to have been toggled during exploration —
# confirm before re-enabling any of them.
codes_not_to_plot <- c(
  # "CO868", 
  # "CT1879", 
  # "FL1847", 
  # "FL1892", 
  "GA1851",
  "GA1860", 
  "HI1859", 
  "HI1897",
  # "IL1866", 
  "LA1825", 
  "LA1844" 
#   "MS1848", 
#   "MS1857", 
  # "NY1876", 
  # "NY1879",
  # "VA1860", 
  # "VA1893", 
  # "WV1868" 
  )
# Escape hatch: uncomment to plot every code.
# codes_not_to_plot <- NULL

# Edge list weighted by raw counts of sections shared: each code keeps its
# top `top_matches` sources (ties retained) with at least `minimum_n`
# shared sections.
top_edges <- summary_matches %>% 
  filter(!is.na(match_code),
         sections_borrowed >= minimum_n) %>% 
  select(borrower_code, match_code, weight = sections_borrowed) %>% 
  group_by(borrower_code) %>% 
  top_n(top_matches, weight) %>% 
  ungroup()
# Rescale to [0, 1] BEFORE dropping the excluded codes, so exclusion does
# not change the widths of the remaining edges.
edges_n <- top_edges %>% 
  mutate(weight = rescale(weight)) %>% 
  filter(!borrower_code %in% codes_not_to_plot,
         !match_code %in% codes_not_to_plot)
edges_n
## Source: local data frame [146 x 3]
## 
##    borrower_code match_code      weight
##            (chr)      (chr)       (dbl)
## 1         AK1900     OR1862 0.398192771
## 2         AK1900     NY1850 0.002409639
## 3         AK1900     OR1854 0.002409639
## 4         AR1868     KY1851 0.209036145
## 5         AR1868     KY1854 0.180120482
## 6         AR1874     AR1868 0.146987952
## 7         AR1874     KY1851 0.007228916
## 8         AZ1865     CA1851 0.199397590
## 9         AZ1865     CA1858 0.087951807
## 10        AZ1887     CA1872 0.203614458
## ..           ...        ...         ...
g_n <- graph_from_data_frame(edges_n, directed = TRUE) 
# Fewest hops from each code to ANY of the four Field Code editions.
# distances() warns that edge weights are ignored — expected; we want hops.
field_editions <- c("NY1848", "NY1849", "NY1850", "NY1851")
node_distances <- g_n %>%
  distances(to = field_editions, algorithm = "unweighted") %>%
  apply(1, min, na.rm = TRUE)
# Color by hop distance; anything three or more hops away (including
# unreachable codes, whose distance is infinite) falls through to blue.
nodes_n <- data_frame(name = names(node_distances),
                      distance = node_distances) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))

# Rebuild with node attributes attached.
g_n <- graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n) 
V(g_n)$year <- extract_date(V(g_n)$name)

# Scale a graph's edge weights into plottable line widths, clamped to
# [min_val, max_val] so extreme weights do not dominate the figure.
#
# g          — an igraph graph with an E(g)$weight attribute.
# multiplier — scale factor applied before clamping.
# Returns a numeric vector, one width per edge.
edge_size_clamp <- function(g, multiplier = 20, max_val = 6, min_val = 1) {
  # Clamp high first, then low — same order as the explicit-assignment
  # version this replaces.
  pmax(pmin(E(g)$weight * multiplier, max_val), min_val)
}

# Reproducible layout for the code-level network, then draw it.
set.seed(4221)
g_n <- add_layout_(g_n, with_graphopt(niter = 4000, spring.length = 25),
                   normalize())
par(mar = c(0, 0, 1, 0))
plot(g_n,
     edge.width = edge_size_clamp(g_n),
     edge.arrow.size = 0,
     vertex.size = 5)
title("Borrowings between codes, number of sections borrowed")

Now do a state to state network:

# Aggregate code-level borrowings up to the state level.
min_state_borrowings <- 100
top_matches <- 2
edges_states <- summary_matches %>% 
  mutate(borrower_date = extract_date(borrower_code),
         match_date = extract_date(match_code),
         borrower_state = extract_state(borrower_code),
         match_state = extract_state(match_code)) %>% 
  # Keep only borrowings that run forward in time and cross state lines.
  filter(!is.na(match_code),
         borrower_date >= match_date,
         borrower_state != match_state) %>% 
  # Total sections borrowed for each (borrower state, source state) pair.
  group_by(borrower_state, match_state) %>% 
  summarize(n = sum(sections_borrowed)) %>% 
  filter(n >= min_state_borrowings) %>% 
  select(borrower_state, match_state, weight = n) %>% 
  # Each state keeps only its top two sources (top_n retains ties).
  group_by(borrower_state) %>% 
  top_n(top_matches, weight) %>% 
  ungroup() %>% 
  mutate(weight = rescale(weight))

edges_states
## Source: local data frame [44 x 3]
## 
##    borrower_state match_state     weight
##             (chr)       (chr)      (dbl)
## 1              AK          OR 0.57655039
## 2              AR          KY 0.60562016
## 3              AZ          CA 0.75484496
## 4              CA          NY 0.69670543
## 5              CO          CA 0.05910853
## 6              CO          IL 0.04166667
## 7              DC          IN 0.11821705
## 8              DT          ND 0.22674419
## 9              DT          NE 0.19476744
## 10             FL          NY 0.03488372
## ..            ...         ...        ...
g_states <- graph_from_data_frame(edges_states, directed = TRUE)

# Hop distance from each state to New York.
# distances() warns that edge weights are ignored — expected; we want hops.
state_distances <- distances(g_states, to = "NY", algorithm = "unweighted") 
nodes_states <- data_frame(name = rownames(state_distances),
                           distance = state_distances[, 1]) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))

# Rebuild with node attributes, then keep only the first connected
# component with at least three vertices (the main network).
g_states <- graph_from_data_frame(edges_states, directed = TRUE,
                                  vertices = nodes_states) %>% 
  decompose(min.vertices = 3) %>% 
  `[[`(1)

set.seed(4221)
g_states <- g_states %>% add_layout_(with_graphopt(niter = 4000,
                                                   spring.length = 25),
                                     normalize())
par(mar = c(0, 0, 1, 0))
# BUG FIX: edge widths must come from THIS graph's weights. The original
# passed g_n (the code-level graph), whose edge vector has a different
# length and ordering than g_states' edges.
plot(g_states, 
     edge.width = edge_size_clamp(g_states), edge.arrow.size = 0.5,
     edge.arrow.mode = 1,
     vertex.size = 5, vertex.label.dist = 0.85, vertex.label.degree = pi)
title("Borrowings between states, number of sections borrowed")